{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# -*- encoding: utf-8 -*-\n",
    "import sys\n",
    "reload(sys)\n",
    "sys.setdefaultencoding('utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from keras.layers.core import Activation, Dense\n",
    "from keras.layers.embeddings import Embedding\n",
    "from keras.layers.recurrent import LSTM\n",
    "from keras.models import Sequential\n",
    "from keras.preprocessing import sequence\n",
    "from sklearn.model_selection import train_test_split\n",
    "import collections  #用来统计词频\n",
    "import logging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 获取训练数据\n",
    "def getTrainSet(inFile):\n",
    "    # 训练集\n",
    "    train_set=[]\n",
    "    # 情感标签集\n",
    "    target_set = []\n",
    "    # 统计所有出现的词\n",
    "    word_ctr = collections.Counter()\n",
    "    # 评论的最大长度\n",
    "    maxlen = 0\n",
    "    len_ctr = collections.Counter()\n",
    "    \n",
    "    # 读入训练数据\n",
    "    f=open(inFile)\n",
    "    lines=f.readlines()\n",
    "    for line in lines:\n",
    "        article = line.replace('\\n','').split('\\t')\n",
    "        \n",
    "        # 情感标签\n",
    "        target_set.append(article[1])\n",
    "        # 内容\n",
    "        content = article[2:]\n",
    "        train_set.append(content)\n",
    "        \n",
    "        # 获得评论的最大长度\n",
    "        if len(content) > maxlen:\n",
    "            maxlen = len(content)\n",
    "        \n",
    "        len_ctr[str(len(content))] += 1\n",
    "        \n",
    "        # 统计所有出现的词\n",
    "        for w in content:\n",
    "            word_ctr[w] += 1\n",
    "\n",
    "    f.close()\n",
    "    print('max_len ',maxlen)\n",
    "    print('nb_words ', len(word_ctr))\n",
    "    print('mean lenth',len_ctr)\n",
    "    return (target_set,train_set,maxlen,word_ctr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 把原始文本转化为由词汇表索引表示的矩阵\n",
    "def trainLSTM(inFile,outFile):\n",
    "    # 读入数据\n",
    "    target_set,data_set,maxlen,word_ctr = getTrainSet(inFile)\n",
    "    \n",
    "    # 创建训练数据\n",
    "    X = np.empty(len(data_set),dtype=list)\n",
    "    y = np.array([int(i) for i in target_set])\n",
    "    \n",
    "#     print \"X len = \",len(X)\n",
    "#     print \"y len = \",len(y)\n",
    "    \n",
    "    # ('max_len ', 172)\n",
    "    # ('nb_words ', 5195)\n",
    "    MAX_FEATURES = 3500\n",
    "    MAX_SENTENCE_LENGTH = 70\n",
    "    \n",
    "    # 对于不在词汇表里的单词，把它们用伪单词 UNK 代替。 \n",
    "    # 根据句子的最大长度 (max_lens)，我们可以统一句子的长度，把短句用 0 填充。\n",
    "    # 接下来建立两个 lookup tables，分别是 word2index 和 index2word，用于单词和数字转换。 \n",
    "    vocab_size = min(MAX_FEATURES, len(word_ctr)) + 2\n",
    "    word2index = {x[0]: i+2 for i, x in enumerate(word_ctr.most_common(MAX_FEATURES))}\n",
    "    word2index[\"PAD\"] = 0\n",
    "    word2index[\"UNK\"] = 1\n",
    "    index2word = {v:k for k, v in word2index.items()}\n",
    "    # 对每一个文章做转换      \n",
    "\n",
    "    i = 0\n",
    "    for news in data_set:\n",
    "        trs_news = []\n",
    "        for w in news:\n",
    "            if w in word2index:\n",
    "                trs_news.append(word2index[w])\n",
    "            else:\n",
    "                trs_news.append(word2index['UNK'])\n",
    "        X[i] = trs_news\n",
    "        i += 1\n",
    "    \n",
    "    # 对文字序列做补齐 ，补齐长度=最长的文章长度 ，补齐在最后，补齐用的词汇默认是词汇表index=0的词汇，也可通过value指定\n",
    "    # 训练好的w2v词表的index = 0 对应的词汇是空格\n",
    "    X = sequence.pad_sequences(X,maxlen=MAX_SENTENCE_LENGTH,padding='post')\n",
    "    \n",
    "    np.save(outFile,np.column_stack([X,y]))\n",
    "    \n",
    "        # 划分数据\n",
    "    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "    \n",
    "   \n",
    "    # 构建网络\n",
    "    HIDDEN_LAYER_SIZE = 64\n",
    "    EMBEDDING_SIZE = 128\n",
    "\n",
    "    model = Sequential()\n",
    "    model.add(Embedding(vocab_size, EMBEDDING_SIZE,input_length=MAX_SENTENCE_LENGTH))\n",
    "    model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2))\n",
    "    model.add(Dense(1))\n",
    "    model.add(Activation(\"sigmoid\"))\n",
    "    model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\",metrics=[\"accuracy\"])\n",
    "  \n",
    "    \n",
    "    # 训练网络\n",
    "    BATCH_SIZE = 32\n",
    "    NUM_EPOCHS = 10\n",
    "    model.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,validation_data=(Xtest, ytest))\n",
    "    \n",
    "    # 预测 \n",
    "    score, acc = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)\n",
    "    print(\"\\nTest score: %.3f, accuracy: %.3f\" % (score, acc))\n",
    "    print('{}   {}      {}'.format('预测','真实','句子'))\n",
    "    for i in range(5):\n",
    "        idx = np.random.randint(len(Xtest))\n",
    "        xtest = Xtest[idx].reshape(1,70)\n",
    "        ylabel = ytest[idx]\n",
    "        ypred = model.predict(xtest)[0][0]\n",
    "        sent = \" \".join([index2word[x] for x in xtest[0] if x != 0])\n",
    "        print(' {}      {}     {}'.format(int(round(ypred)), int(ylabel), sent))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def main():\n",
    "    \n",
    "#     inFile = \"./data/test.txt\"\n",
    "    inFile = \"./data/processed_lstm/all_summary.txt\"\n",
    "    outFile = \"./model/train_lstm.npy\"\n",
    "   \n",
    "    # 把分词以后的文本转化为供LSTM训练的数据文件\n",
    "    trainLSTM(inFile,outFile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
